#remove scientific notation
options(scipen=999)

—- Load packages —-

library(stringr)
library(corrplot)
## corrplot 0.84 loaded

—- Load neighborhood data —-

load("Data/county_factors.rda")
load("Data/county_500CitiesData.rda")

—- Load and format covid data —-

data.path <- "Data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/"

# Read in the data
US.deaths <- read.csv(
  paste0(data.path, "time_series_covid19_deaths_US.csv"), 
  header = T, stringsAsFactors = F)
US.cases <- read.csv(
  paste0(data.path, "time_series_covid19_confirmed_US.csv"), 
  header = T, stringsAsFactors = F)

# Read in the header seprately.
US.cases.head <- read.csv(
  paste0(data.path, "time_series_covid19_confirmed_US.csv"), 
  header = F, stringsAsFactors = F)[1,]
US.deaths.head <- read.csv(
  paste0(data.path, "time_series_covid19_deaths_US.csv"), 
  header = F, stringsAsFactors = F)[1,]

# Correct the dates in the header to be more useable as
# column names.
proper_date <- function(dates){
  dates <- sapply(dates, strsplit, split = "/")
  dates <- lapply(dates, str_pad, width = 2, side = "left", pad = "0")
  dates <- lapply(dates, paste, collapse = "_")
  dates <- unlist(dates)
  
  return(dates)
}

dates.cases <- proper_date(US.cases.head[-c(1:11)])
dates.deaths <- proper_date(US.deaths.head[-c(1:12)])

names(US.cases) <- c(US.cases.head[1,1:11], dates.cases)
names(US.deaths) <- c(US.deaths.head[1,1:12], dates.deaths)

if(sum(US.cases$UID != US.deaths$UID) > 0){warning("COVID data rows do not match!")}
US.cases$Population <- US.deaths$Population
US.cases <- US.cases[,c(1:11, ncol(US.cases), 12:(ncol(US.cases)-1))]

Split the dataset into the data and the info for usability.

US.cases.info <- as.matrix(US.cases[,1:12])
US.cases.data <- as.matrix(US.cases[,-c(2:12)])
US.deaths.info <- as.matrix(US.deaths[,1:12])
US.deaths.data <- as.matrix(US.deaths[,-c(2:12)])

rownames(US.cases.info) <- US.cases.info[,1]
US.cases.info <- US.cases.info[,-1]
rownames(US.cases.data) <- US.cases.data[,1]
US.cases.data <- US.cases.data[,-1]
rownames(US.deaths.info) <- US.deaths.info[,1]
US.deaths.info <- US.deaths.info[,-1]
rownames(US.deaths.data) <- US.deaths.data[,1]
US.deaths.data <- US.deaths.data[,-1]


ndays.cases <- ncol(US.cases.data)
ndays.deaths <- ncol(US.deaths.data)

nobs <- nrow(US.cases.data)

—- The Curve —-

state.curve <- function(state, stat = c("cases", "deaths"), logScale = T){
  if(stat == "cases"){
    data <- US.cases.data[which(US.cases$Province_State == state),]
  }else if(stat == "deaths"){
    data <- US.deaths.data[which(US.deaths$Province_State == state),]
  }
  data.sum <- colSums(data)
  day.first.case <- min(which(data.sum > 0))
  n.days <- length(data.sum)
  
  if(logScale == T){
  barplot(data.sum[day.first.case:n.days], 
          main = paste0("Total COVID-19 ", stat," by date in ", state, ", log scale"), 
          log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
  }else{
    barplot(data.sum[day.first.case:n.days], 
          main = paste0("Total COVID-19 ", stat," by date in ", state), 
          las = 2, cex.axis = 1, cex.names = 0.5)
  }
}
state.rise <- function(state, stat = c("cases", "deaths")){
  if(stat == "cases"){
    data.thisState <- US.cases.data[which(US.cases$Province_State == state),]
  }else if(stat == "deaths"){
    data.thisState <- US.deaths.data[which(US.deaths$Province_State == state),]
  }
  
  data.sum <- colSums(data.thisState)
  n.days <- ncol(data.thisState)
  
  rise.cases <- matrix(ncol = n.days - 1, nrow = 1)
  colnames(rise.cases) <- colnames(data.thisState)[-1]
  for(i in 1:ncol(rise.cases) + 1){
    rise <- data.sum[i] - data.sum[i-1]
    rise.cases[i-1] <- rise
  }
  
  day.first.case <- min(which(rise.cases > 0))
  n.days <- length(rise.cases)
  
  barplot(rise.cases[,day.first.case:n.days], main = paste0("Rise in COVID-19 ", stat, " by Date in ", state), las = 2, cex.axis = 1, cex.names = 0.5)
}

state.curve("New York", "cases")
state.curve("New York", "cases")

state.curve("New York", "deaths")

state.rise("New York", "cases")

state.rise("New York", "deaths")

state.curve("California", "cases")

state.curve("California", "deaths")

state.rise("California", "cases")

state.rise("California", "deaths")

state.curve("Oklahoma", "cases")

state.curve("Oklahoma", "deaths")

state.rise("Oklahoma", "cases")

state.rise("Oklahoma", "deaths")

state.curve("Georgia", "cases")

state.curve("Georgia", "deaths")

state.rise("Georgia", "cases")

state.rise("Georgia", "deaths")

county.curve <- function(county, stat = c("cases", "deaths")){
  if(stat == "cases"){
    data <- US.cases.data[which(US.cases$Admin2 == county),]
  }else if(stat == "deaths"){
    data <- US.deaths.data[which(US.deaths$Admin2 == county),]
  }
  
  day.first.case <- min(which(data > 0))
  n.days <- length(data)
  
  barplot(data[day.first.case:n.days], main = paste0("Total COVID-19 ", stat," by date in ", county), log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
  
}

county.curve("Tulsa", "cases")

county.curve("Tulsa", "deaths")

—- Calculate some useful stats to compare with neighborhood data —-

US.stats <- data.frame(UID = US.cases$UID)
cases.total <- colSums(US.cases.data)

day.first.case <- min(which(cases.total > 100))
n.days <- length(cases.total)

par(mar = c(5,5,4,2))
barplot(cases.total[day.first.case:n.days], main = "Total COVID-19 cases by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)

barplot(cases.total[day.first.case:n.days], main = "Total COVID-19 cases by Date in US, log scale", las = 2, cex.axis = 1, cex.names = 0.5, log = "y")

deaths.total <- colSums(US.deaths.data)

day.first.case <- min(which(deaths.total > 0))
n.days <- length(deaths.total)

barplot(deaths.total[day.first.case:n.days], main = "Total COVID-19 deaths by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)

barplot(deaths.total[day.first.case:n.days], main = "Total COVID-19 deaths by Date in US, log scale", las = 2, cex.axis = 1, cex.names = 0.5, log = "y")

Average rise in cases per day

avg.rise.cases

rise.cases <- matrix(ncol = ndays.cases - 1, nrow = nobs)
colnames(rise.cases) <- colnames(US.cases.data)[-1]
for(i in 1:ncol(rise.cases) + 1){
  rise <- US.cases.data[,i] - US.cases.data[,i-1]
  rise.cases[,i-1] <- rise
}

US.stats$avg.rise.cases <- apply(rise.cases, 1, mean)

rise.cases.total <- colSums(rise.cases)

day.first.case <- min(which(rise.cases.total > 0))
n.days <- length(rise.cases.total)

barplot(rise.cases.total[day.first.case:n.days], main = "Rise in Cases of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)

Average rise in deaths per day

avg.rise.deaths

rise.deaths <- matrix(ncol = ndays.deaths - 1, nrow = nobs)
colnames(rise.deaths) <- colnames(US.deaths.data)[-1]
for(i in 1:ncol(rise.deaths) + 1){
  rise <- US.deaths.data[,i] - US.deaths.data[,i-1]
  rise.deaths[,i-1] <- rise
}

US.stats$avg.rise.deaths <- apply(rise.deaths, 1, mean)

rise.deaths.total <- colSums(rise.deaths)

day.first.case <- min(which(rise.deaths.total > 0))
n.days <- length(rise.deaths.total)

barplot(rise.deaths.total[day.first.case:n.days], main = "Rise in Deaths of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)

Total cases

total.cases

US.stats$total.cases <- US.cases.data[,ndays.cases]

Total cases per capita

US.stats$total.cases.percap <- US.stats$total.cases / US.cases$Population
US.stats$total.cases.percap[US.cases$Population == 0] <- NA
hist(US.stats$total.cases.percap)

Total deaths

total.deaths

US.stats$total.deaths <- US.deaths.data[,ndays.deaths]

Total deaths per capita

total.deaths.percap

US.stats$total.deaths.percap <- US.stats$total.deaths / US.deaths$Population
US.stats$total.deaths.percap[US.deaths$Population == 0] <- NA

max(US.stats$total.deaths.percap,na.rm = T)
## [1] 0.002565304

Total deaths per case

total.deaths.percase Error in Johns Hopkins data has rows with total.deaths > total.cases.

# pos.case.ind <- US.stats$total.cases > 0
# US.stats$total.deaths.percase[pos.case.ind] <- US.stats$total.deaths[pos.case.ind] / US.stats$total.cases[pos.case.ind]
# US.stats$total.deaths.percase[!pos.case.ind] <- 0
US.stats$total.deaths.percase <- US.stats$total.deaths / US.stats$total.cases
US.stats$total.deaths.percase[US.stats$total.cases == 0] <- NA

US.stats[which(US.stats$total.deaths > US.stats$total.cases),]
##           UID avg.rise.cases avg.rise.deaths total.cases
## 3203 84090002      0.0000000      0.04444444           0
## 3206 84090006      0.0000000      0.02222222           0
## 3222 84090024      0.0000000      1.33333333           0
## 3229 84090031      0.0000000      0.16666667           0
## 3231 84090033      0.1555556      0.43333333          14
## 3234 84090036      0.0000000      0.03333333           0
## 3248 84090051      0.0000000      1.55555556           0
## 3250 84090054      0.0000000      0.21111111           0
## 3252 84090056      0.0000000      0.01111111           0
##      total.cases.percap total.deaths total.deaths.percap
## 3203                 NA            4                  NA
## 3206                 NA            2                  NA
## 3222                 NA          120                  NA
## 3229                 NA           15                  NA
## 3231                 NA           39                  NA
## 3234                 NA            3                  NA
## 3248                 NA          140                  NA
## 3250                 NA           19                  NA
## 3252                 NA            1                  NA
##      total.deaths.percase
## 3203                   NA
## 3206                   NA
## 3222                   NA
## 3229                   NA
## 3231             2.785714
## 3234                   NA
## 3248                   NA
## 3250                   NA
## 3252                   NA

—- Merge COVID data with Neighborhood data —-

US.stats$ID <- str_pad(US.stats$UID, 8, "left", pad = "0")
US.stats$ID <- substr(US.stats$ID, 4, 8)

data.merge <- merge(US.stats, county_factors, by = "ID")

—- Plot the relationships —-

data.cor <- cor(data.merge[,-c(1:2)], use = "complete.obs", method = "spearman")
corrplot.mixed(data.cor, upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)

data.merge2 <- merge(data.merge, county_500CitiesData, by = "ID", all.x = F)

—- Plot the relationships —-

data.cor2 <- cor(data.merge2[,-c(1:2)], use = "complete.obs", method = "spearman")
corrplot.mixed(data.cor2, upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)

corrplot.mixed(data.cor2[1:7,8:42], upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)